
/* Short aliases: vec2/vec3/vec4 expand to float2/float3/float4, and
 * rgb/rgba expand to the xyz/xyzw component selectors. */
#define vec2 float2
#define vec3 float3
#define vec4 float4
#define rgb xyz
#define rgba xyzw
/* Larger / smaller of two expressions.
 * The whole expansion is parenthesized so the macro can be used inside a
 * bigger expression (e.g. 2 * _max(a, b)) without the surrounding operators
 * binding to the comparison or to one arm of the ?: only.
 * Each argument can be evaluated twice, so do not pass expressions with
 * side effects. */
#define _max(a,b) ((a)>(b)?(a):(b))
#define _min(a,b) ((a)<(b)?(a):(b))
/* Sampler passed to the read_imagef calls below: linear filtering,
 * clamp-to-edge addressing, normalized coordinates. */
const sampler_t sampler = CLK_FILTER_LINEAR
                        | CLK_ADDRESS_CLAMP_TO_EDGE
                        | CLK_NORMALIZED_COORDS_TRUE;
/*
 * Sample src_data at a coordinate remapped through param->origROI.
 *
 * tc is multiplied component-wise by (origROI[2], origROI[3]) and then offset
 * by (origROI[0], origROI[1]) before the read. The sampled texel is returned
 * with its x and z components swapped (.zyxw).
 */
vec4 INPUT(image2d_t src_data,  __global FilterParam* param, vec2 tc)
{
	const vec2 roi_scale = (vec2)(param->origROI[2], param->origROI[3]);
	const vec2 roi_offset = (vec2)(param->origROI[0], param->origROI[1]);
	const vec2 src_tc = tc * roi_scale + roi_offset;

	const vec4 sampled = read_imagef(src_data, sampler, src_tc);
	return sampled.zyxw;
}

/*
 * Sample src_data directly at tc (no region remapping) and return the texel
 * with its x and z components swapped (.zyxw).
 */
vec4 INPUTTEXTURE(image2d_t src_data, vec2 tc)
{
	const vec4 sampled = read_imagef(src_data, sampler, tc);
	return sampled.zyxw;
}

/*
 * Per-pixel lookup filter: remaps the source pixel through three overlay
 * images and blends the remapped colour with the original.
 *
 * For each work-item:
 *   1. Read the source pixel through INPUT at the work-item's normalized
 *      centre coordinate.
 *   2. Read overlay1 at the same coordinate with y flipped (1 - y).
 *   3. For each of x, y, z: read overlay2 at (overlay1 channel, source
 *      channel) and keep the matching channel of the result.
 *   4. For each of x, y, z: read overlay3 at (value from step 3, fixed row
 *      .16666 / .5 / .83333) and keep the matching channel.
 *   5. Write orig*(1 - alpha/100) + mapped*(alpha/100), keeping the source
 *      alpha (w) unchanged.
 *
 * Parameters:
 *   src_data   source image, read through INPUT
 *   overlay1   first lookup image, read with flipped y
 *   overlay2   second lookup image, addressed by (overlay1, source) values
 *   overlay3   third lookup image, one fixed row per channel
 *   dest_data  destination image, written at the work-item's global id
 *   param      filter parameters (forwarded to INPUT and to the
 *              get_global_id0/get_global_id1 helpers)
 *   alpha      blend weight in percent; the caller is expected to pass
 *              0..100, the value is not clamped here
 *
 * The global work size is used as the resolution when normalizing
 * coordinates (original note: work-items should number newW*newH).
 */
__kernel void MAIN(
      __read_only image2d_t src_data,
	  __read_only image2d_t overlay1,
	  __read_only image2d_t overlay2,
	  __read_only image2d_t overlay3,
      __write_only image2d_t dest_data,
       __global FilterParam* param,
	  int alpha
	 )
{
	const int W = get_global_size(0);
	const int H = get_global_size(1);
	const float2 iResolution = (float2)(W,H);

	// Destination pixel: the raw global id of this work-item.
	const int2 coordinate = (int2)(get_global_id(0), get_global_id(1));

	// get_global_id0/1 are helpers defined outside this block; presumably they
	// return the work-item position adjusted by param -- confirm at their definition.
	const vec2 fragCoord = (vec2)(get_global_id0( param), get_global_id1( param));

	// Centre of the pixel, normalized by the global work size.
	const vec2 tc = (fragCoord + (float2)(0.5f)) / iResolution;

	const vec4 orig = INPUT(src_data, param, tc);
	const vec3 bbTexel = INPUTTEXTURE(overlay1, (vec2)(tc.x, 1.0f - tc.y)).xyz;

	// First lookup: overlay2 is addressed by (overlay1 value, source value),
	// one read per channel.
	vec4 texel = orig;
	texel.x = INPUTTEXTURE(overlay2, (vec2)(bbTexel.x, texel.x)).x;
	texel.y = INPUTTEXTURE(overlay2, (vec2)(bbTexel.y, texel.y)).y;
	texel.z = INPUTTEXTURE(overlay2, (vec2)(bbTexel.z, texel.z)).z;

	// Second lookup: each channel reads its own fixed row of overlay3.
	vec3 mapped;
	mapped.x = INPUTTEXTURE(overlay3, (vec2)(texel.x, .16666f)).x;
	mapped.y = INPUTTEXTURE(overlay3, (vec2)(texel.y, .5f)).y;
	mapped.z = INPUTTEXTURE(overlay3, (vec2)(texel.z, .83333f)).z;

	// INPUT/INPUTTEXTURE return texels with x and z swapped; .zyx swaps them
	// back before the write. Alpha is taken from the source pixel.
	const float alpha_f = (float)alpha;
	const vec4 blended = (vec4)(orig.zyx*(1.0f - alpha_f/100.0f) + mapped.zyx*alpha_f/100.0f, orig.w);

	write_imagef(dest_data, coordinate, blended);
}